Author

Nathan Herling

Published

June 20, 2025

Code
#--------------------->
#################
# Package Setup #
#################
#Check if pacman [package manager] is installed, if not install it.
#throw [FYI] alert either way.
if (!requireNamespace("pacman", quietly = TRUE)) {
  message("Installing 'pacman' (not found locally)...")
  install.packages("pacman")
} else {
  message("[FYI]\n'pacman' already installed — skipping install.")
}
[FYI]
'pacman' already installed — skipping install.
Code
# use this line for installing/loading
# pacman::p_load()
# - packages to load stored in a variable (vector)
pkgs <- c(
  "tidyverse",
  "glue",
  "scales",
  "lubridate",
  "patchwork",
  "ggh4x",
  "ggrepel",
  "openintro",
  "ggridges",
  "dsbox",
  "janitor",
  "here",
  "knitr",
  "ggthemes",
  "ggplot2",
  "kableExtra",
  "palmerpenguins",
  "grid",
  "htmltools",
  "plotly",
  "ggforce",
  "cowplot",
  "magick",
  "forcats",
  "stringr"
)
# - load from the character array/vector
pacman::p_load(char=pkgs)

# - install tidyverse/dsbox directly from Git Hub
# - this allows for the possible need to install on a repo. pull.
# - and, if it's already installed just thorw an alert.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  message("Installing 'dsbox' from GitHub (not found locally)...")
  suppressMessages(devtools::install_github("tidyverse/dsbox"))
} else {
  message("[FYI]\n'dsbox' already installed — skipping GitHub install.")
}
[FYI]
'dsbox' already installed — skipping GitHub install.
Code
# - alert to user packages loaded.
# Set number of columns (adjustable)
n_cols <- 4

# Add * to each package name
pkgs <- paste0("* ", pkgs)

# Calculate number of rows based on total packages
n_rows <- ceiling(length(pkgs) / n_cols)

# Pad with empty strings to complete grid
pkgs_padded <- c(pkgs, rep("", n_rows * n_cols - length(pkgs)))

# Create matrix (fill by row)
pkg_matrix <- matrix(pkgs_padded, nrow = n_rows, byrow = TRUE)

# Print header
cat("The packages loaded:")
The packages loaded:
Code
# Loop and print each row (use invisible to suppress NULL)
invisible(apply(pkg_matrix, 1, function(row) {
  cat(paste(format(row, width = 22), collapse = ""), "\n")
}))
* tidyverse           * glue                * scales              * lubridate            
* patchwork           * ggh4x               * ggrepel             * openintro            
* ggridges            * dsbox               * janitor             * here                 
* knitr               * ggthemes            * ggplot2             * kableExtra           
* palmerpenguins      * grid                * htmltools           * plotly               
* ggforce             * cowplot             * magick              * forcats              
* stringr                                                                                
Code
#-------------------------->
######################
# Basic set Theme up #
######################
# ---- set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)
## ---- end theme set up

(1) - function block

Q5 ….

Code
# ............ A function block, to handle Q3,Q4 with minimal code duplication

# - size as a variable
set_dot_size <- 1
# Function for the "All" group plot (g0)
plot_all <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        response = as_labeller(response_labels),
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 11) +
    labs(
      title = "COVID-19 Vaccine Attitudes by Demographic Group",
      x = NULL,
      y = NULL
    ) +
    theme(
      plot.title = element_text(hjust = 0.5),
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_text(
        vjust = 0.5,
        size = strip_text_size,
        margin = margin(t = 20, b = 10, r = 5, l = 5)
      ),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 15, l = 15)
      ),
      axis.text.y = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Age plot (g1)
plot_age <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 13, l = 13)
      ),
      axis.text.y = element_text(size = 10),
      panel.spacing = unit(1, "lines"),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Gender plot (g2)
plot_gender <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 5, l = 6)
      ),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_blank(),
      panel.spacing = unit(1, "lines"),
      axis.ticks.x = element_blank()
    )
}

# Function for the Race plot (g3)
plot_race <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 10, l = 10)
      ),
      axis.text.y = element_text(size = 10),
      panel.spacing = unit(1, "lines"),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Ethnicity plot (g4)
plot_ethnicity <- function(data, sub_title_specific) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 10) +
    labs(
      x = NULL,
      y = paste0("Mean Likert score\n(Error bars: ", sub_title_specific, ")")
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 4, l = 7)
      ),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_text(size = 10),
      axis.ticks.x = element_line(),
      panel.spacing = unit(1, "lines")
    )
}
# ..... prepare the variables.
# . ethnicity.
filter_ethnicity_data <- function(data) {
  data %>%
    filter(explanatory == "exp_ethnicity") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
                                 "1" = "Hispanic/Latino",
                                 "2" = "Non-Hispanic/Non-Latino"),
      explanatory_value = factor(explanatory_value, levels = c(
        "Hispanic/Latino", "Non-Hispanic/Non-Latino"
      )),
      explanatory = factor(explanatory, levels = c(
        "All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity"
      ))
    )
}

# . age
filter_age_data <- function(data) {
  data %>%
    filter(explanatory == "exp_age_bin") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
                                 "0" = "<20",
                                 "20" = "21-25",
                                 "25" = "26-30",
                                 "30" = ">30"
      ),
      explanatory_value = factor(explanatory_value, levels = c("<20", "21-25", "26-30", ">30")),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}


# . gender
filter_gender_data <- function(data) {
  data %>%
    filter(explanatory == "exp_gender") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = as.character(explanatory_value),
      explanatory_value = fct_recode(factor(explanatory_value),
        "Prefer not to say" = "4",
        "Non-binary third gender" = "3",
        "Male" = "0",
        "Female" = "1"
      ),
      explanatory_value = factor(explanatory_value, levels = rev(c(
        "Prefer not to say",
        "Non-binary third gender",
        "Male",
        "Female"
      ))),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}


# . race
filter_race_data <- function(data) {
  data %>%
    filter(explanatory == "exp_race") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
        "1" = "American Indian/Alaska Native",
        "2" = "Asian",
        "3" = "Black/African American",
        "4" = "Native Hawaiian/Other Pacific Islander",
        "5" = "White"
      ),
      explanatory_value = factor(explanatory_value, levels = rev(c(
        "White",
        "Native Hawaiian/Other Pacific Islander",
        "Black/African American",
        "Asian",
        "American Indian/Alaska Native"
      ))),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}

1 - Du Bois challenge.

Du Bois challenge. Recreate the following visualization by W.E.B. Du Bois on family budgets split by income classes for 150 families in Atlanta, Georgia. This visualization was originally created using ink and watercolors.

Note: Since there appears to be some allowable creativity with the features reperesented. I left a scale on the bottom of the parchment, and left off the ‘connecting lines’ connecting the same colored segments together for the stacked bar charts. It ended up being a lot of code - to separately construct and place all pieces of the chart together. First effort. There may be a more efficient way to re-make the plot? . I rendered the output image as html - and I cannot git rid of the small ‘png 2’ label (atm).

png 2

2 - COVID survey - interpretation

Q2 - Interpret what’s occurring in the survey, and discuss any results that go against your intuition.
In a chart this large, “interpret” (as opposed to simply describing) really means identifying trends in the data.
1. Trust and Profession:
Medical students showed noticeably more variance in agreement (i.e. a broader distribution of Likert scores) than nursing students with the statement “I trust the information that I have received about the vaccines.” This could be interpreted as a difference in behavior/diagnostic techniques between the two professions.
2. Concern and Age:
Of note was the general ambivalence towards ‘safety and side effects’, as represented by the average score of ‘3’ across all age groups. While the distribution was wide for all age groups, the general consensus among those in the medical profession was no concern for the safety involved with the vaccine.
3. Vaccination History and Vaccine Perception:
Across the board - high ‘strongly agree’ - to the category of ‘I will recommend the vaccine to family, friends, and community members.’ This would show a very strong trend of trusting the science in the medical community.

4. Counter intuitive/interesting
One interesting trend was that responses by gender identity showed subtle but noteworthy divergence. For some response variables—especially those touching on vaccine efficacy or side effects—nonbinary respondents’ scores tended to fall at the extremes, reflecting either more trust or more concern.

Another pattern emerged in the age dimension, with younger respondents appearing to show greater trust in vaccines.

Overall, it’s striking that the medical and nursing student community was not in lockstep regarding their interpretations of the science and its safety.

Code
#------- no code necessary ..

3 - COVID survey - reconstruct

Q3 ….

Data Analysis - Q1
📄 The original data frame (raw_preview) has:
- 1123 rows
- 14 columns
Table 1. Dataset Missing Value Diagnostics
Metric Value
Total % of values missing 8.65
Percent of rows with ≥1 NA 17.4
Row indices with ≥1 NA (first 20) 3, 37, 41, 53, 61, 67, 76, 88, 98, 103, 114, 115, 116, 118, 150, 151, 152, 153, 154, 155
Percent of rows with >1 NA 12.13
Row indices with >1 NA (first 20) 3, 53, 114, 115, 116, 118, 151, 152, 153, 154, 155, 208, 216, 299, 343, 345, 346, 347, 374, 403
✅ Rows with only `response_id` and all other fields missing have been removed.
Original dataset rows: 1121
Rows removed: 10
Cleaned dataset size: 1111 rows × 14 columns


**Rows_Removed**
row:3 row:152 row:153 row:414
row:529 row:556 row:577 row:835
row:987 row:1050
Code
# - Step 1a: print the dim of the original df.
original_dim <- dim(raw_preview)

cat(glue(
  "📄 The original data frame (`raw_preview`) has:\n",
  "- {original_dim[1]} rows\n",
  "- {original_dim[2]} columns\n\n",
  "⚠️  Rows with no available data (i.e., only `response_id` present)\n will be removed in preprocessing.\n",
  "\n✅ **New Dimensions** of `survey_clean` after cleaning:\n",
  "📊 Rows: {nrow(survey_clean)}\n",
  "📐 Columns: {ncol(survey_clean)}\n"
))
📄 The original data frame (`raw_preview`) has:
- 1123 rows
- 14 columns

⚠️  Rows with no available data (i.e., only `response_id` present)
 will be removed in preprocessing.

✅ **New Dimensions** of `survey_clean` after cleaning:
📊 Rows: 1111
📐 Columns: 14
Code
#-- ... --- based on info in pdf file and .csv .. encode the following
# exp_profession........... 
# exp_flu_vax.............. 
# exp_gender............... Q2 What is your gender? 
# exp_race................. Q3 What is your race?   
# exp_ethnicity............ Q4 What is your ethnicity?  
# exp_age_bin.............. Q1 What is your age?    
# exp_already_vax.......... 
# resp_safety.............. Q26 Based on my understanding, I believe the vaccine is safe.   
# resp_confidence_science.. Q34 I am confident in the scientific vetting process for the new COVID vaccines.    
# resp_concern_safety...... Q27 I am concerned about the safety and side effects of the vaccine.    
# resp_feel_safe_at_work... Q28 Getting the vaccine will make me feel safer at work.    
# resp_will_recommend...... Q29 I will recommend the vaccine to family, friends, and community members. 
# resp_trust_info.......... Q31 I trust the information that I have received about the COVID-19 vaccines.

covid_survey_longer <- survey_clean |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  mutate(explanatory_value = as.factor(explanatory_value)) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

print(covid_survey_longer)
# A tibble: 43,428 × 5
   response_id explanatory    explanatory_value response         
         <dbl> <chr>          <fct>             <chr>            
 1           1 exp_profession 1                 resp_safety      
 2           1 exp_profession 1                 resp_confidence_…
 3           1 exp_profession 1                 resp_concern_saf…
 4           1 exp_profession 1                 resp_feel_safe_a…
 5           1 exp_profession 1                 resp_will_recomm…
 6           1 exp_profession 1                 resp_trust_info  
 7           1 exp_flu_vax    1                 resp_safety      
 8           1 exp_flu_vax    1                 resp_confidence_…
 9           1 exp_flu_vax    1                 resp_concern_saf…
10           1 exp_flu_vax    1                 resp_feel_safe_a…
# ℹ 43,418 more rows
# ℹ 1 more variable: response_value <dbl>
Q3 code explanation:

covid_survey_longer <- covid_survey |> pivot_longer( cols = starts_with(“exp_”), names_to = “explanatory”, values_to = “explanatory_value” ) |> filter(!is.na(explanatory_value)) |> pivot_longer( cols = starts_with(“resp_”), names_to = “response”, values_to = “response_value” )


first pivot_longer():
Converts all columns that start with “exp_” (e.g., exp_profession, exp_gender, etc.) from wide format into long format.
Creates two new columns:
explanatory: holds the original column names (like “exp_profession”)
explanatory_value: holds the actual values from those columns (like “Nursing” or “1”)
second pivot_longer():
After already pivoting the explanatory variables, this takes the remaining
response variables (resp_safety, resp_confidence_science, etc.) and pivots them long as well.
Creates two new columns:
response: original column name
response_value: corresponding value

create the df/tibble: covid_survey_summary_stats_by_group

Code
# - group the data - by explanatory, explanatory_value, and response calc.
# - the following stats:
# - mean of the response_value
# - low 10th percentile of the response_value
# - high 90th percentile of the response_value
# - rename the df coivd_survey_summart_stats_by_group
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    .groups = "drop"
  )

print(covid_survey_summary_stats_by_group)
# A tibble: 126 × 6
   explanatory explanatory_value response        mean   low  high
   <chr>       <fct>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 0                 resp_concern_…  3.35     2   4.4
 2 exp_age_bin 0                 resp_confiden…  1.65     1   2.4
 3 exp_age_bin 0                 resp_feel_saf…  1.71     1   3.8
 4 exp_age_bin 0                 resp_safety     1.41     1   2  
 5 exp_age_bin 0                 resp_trust_in…  1.41     1   2  
 6 exp_age_bin 0                 resp_will_rec…  1.35     1   1.8
 7 exp_age_bin 20                resp_concern_…  3.32     2   5  
 8 exp_age_bin 20                resp_confiden…  1.31     1   2  
 9 exp_age_bin 20                resp_feel_saf…  1.20     1   2  
10 exp_age_bin 20                resp_safety     1.95     1   5  
# ℹ 116 more rows
Code
#View(covid_survey_summary_stats_by_group)

create the df/tibble: covid_survey_summary_stats_all

Code
library(dplyr)

covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = factor(""),
    .groups = "drop"
  )

print(covid_survey_summary_stats_all)
# A tibble: 6 × 6
  response         mean   low  high explanatory explanatory_value
  <chr>           <dbl> <dbl> <dbl> <chr>       <fct>            
1 resp_concern_s…  3.28     1     5 All         ""               
2 resp_confidenc…  1.43     1     2 All         ""               
3 resp_feel_safe…  1.36     1     2 All         ""               
4 resp_safety      2.03     1     5 All         ""               
5 resp_trust_info  1.40     1     2 All         ""               
6 resp_will_reco…  1.21     1     2 All         ""               
Code
#View(covid_survey_summary_stats_all)

Bind the two df’s
create the df/tibble: covid_summary_of_stats

Code
# Get existing levels from grouped data
age_levels <- levels(covid_survey_summary_stats_by_group$explanatory_value)

# Add a new level to represent the 'All' group
age_levels_with_all <- c(age_levels, "")

# Create the all-summary with the new factor level
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = factor("", levels = age_levels_with_all),
    .groups = "drop"
  )

# Ensure grouped summary has the same levels too
covid_survey_summary_stats_by_group$explanatory_value <- factor(
  covid_survey_summary_stats_by_group$explanatory_value,
  levels = age_levels_with_all
)

# Bind them safely now
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_all,
  covid_survey_summary_stats_by_group
)

print(covid_survey_summary_stats)
# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <fct>            
 1 resp_concern_…  3.28     1   5   All         ""               
 2 resp_confiden…  1.43     1   2   All         ""               
 3 resp_feel_saf…  1.36     1   2   All         ""               
 4 resp_safety     2.03     1   5   All         ""               
 5 resp_trust_in…  1.40     1   2   All         ""               
 6 resp_will_rec…  1.21     1   2   All         ""               
 7 resp_concern_…  3.35     2   4.4 exp_age_bin "0"              
 8 resp_confiden…  1.65     1   2.4 exp_age_bin "0"              
 9 resp_feel_saf…  1.71     1   3.8 exp_age_bin "0"              
10 resp_safety     1.41     1   2   exp_age_bin "0"              
# ℹ 122 more rows

Q3e - recreate plot

Code
# Labels for rows (explanatory variables), including Gender and Race
explanatory_labels <- c(
  All = "All",
  exp_age_bin = "Age",
  exp_gender = "Gender",
  exp_race = "Race"    ,# Added Race label
  exp_ethnicity = "Ethnicity"
)

# - call formatting for encoded data
covid_age_only <- filter_age_data(covid_survey_summary_stats_by_group)

covid_gender_only <- filter_gender_data(covid_survey_summary_stats_by_group)

covid_race_only <- filter_race_data(covid_survey_summary_stats_by_group)

covid_ethnicity_only <- filter_ethnicity_data(covid_survey_summary_stats_by_group)

# Label mappings for response
response_labels <- c(
  resp_safety = "Vaccine is safe",
  resp_feel_safe_at_work = "Feel safer\n at work",
  resp_concern_safety = "Concern about \nvaccine safety",
  resp_confidence_science = "Confidence in \nscientific vetting",
  resp_trust_info = "Trust in \nvaccine info",
  resp_will_recommend = "Will recommend\nvaccine"
)

# Reorder response factor levels to match response_labels
covid_age_only <- covid_age_only %>%
  mutate(response = factor(response, levels = names(response_labels)))

covid_gender_only <- covid_gender_only %>%
  mutate(response = factor(response, levels = names(response_labels)))

# View distinct codes used in the exp_ethnicity variable
covid_survey_summary_stats_by_group %>%
  filter(explanatory == "exp_ethnicity") %>%
  mutate(explanatory_value = as.character(explanatory_value)) %>%
  distinct(explanatory_value) %>%
  arrange(explanatory_value)
# A tibble: 2 × 1
  explanatory_value
  <chr>            
1 1                
2 2                
Code
# Vector controlling heights of each row - add height for race
row_heights <- c(
  0.5,  # - 'All' row height — adjust as needed
  3,    # - 'exp_age_bin' row height
  3,    # - 'exp_gender' row height - adjust as desired
  3,    # - 'exp_race' row height - new Race row
  3     # - ethnicity
)

# Reorder response factor levels for 'All' layer
covid_all_only <- covid_survey_summary_stats_all %>%
  filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
  mutate(response = factor(response, levels = names(response_labels)))

# - vars for standardizing box size row/col
# Define variables for strip appearance
strip_fill_color <- "gray90"
strip_text_color <- "black"
strip_text_size <- 10
strip_text_face <- "plain"
strip_text_angle_x <- 0
strip_text_angle_y <- 0
strip_text_vjust_y <- 0.5
strip_placement <- "outside"  # already used in your code


# Call some functions
g0 <- plot_all(covid_all_only)
# - second layer - Age
g1 <- plot_age(covid_age_only)
# - third layer - gender
g2 <- plot_gender(covid_gender_only)
# Fourth layer - Race
g3 <- plot_race(covid_race_only)
# Fifth layer: Ethnicity (if present)
g4 <- plot_ethnicity(covid_ethnicity_only,"Error bars in range from 10th to 90th percentile")

# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)
composite_plot <- (g0 / g1 / g2 / g3 / g4 + plot_layout(heights = row_heights)) &
  theme(plot.margin = margin(0, 0, 0, 0))

print(composite_plot)

4 - COVID survey - re-reconstruct

Q4 ….Make Plot from Q3, but use different end point quarantiles.
When the error bars represent the 25th and 75th percentiles instead of the 10th and 90th, the intervals become narrower, reflecting a tighter range around the median of the data. This change reduces the apparent variability and uncertainty in responses. Compared to the previous plot, the shorter error bars may make the group differences appear more precise but potentially understate the true variability. Therefore, while the overall trends remain similar, conclusions about the degree of uncertainty should be adjusted to recognize that the interquartile range excludes more extreme values.

# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <fct>            
 1 resp_concern_…  3.28     2     4 All         ""               
 2 resp_confiden…  1.43     1     2 All         ""               
 3 resp_feel_saf…  1.36     1     1 All         ""               
 4 resp_safety     2.03     1     3 All         ""               
 5 resp_trust_in…  1.40     1     2 All         ""               
 6 resp_will_rec…  1.21     1     1 All         ""               
 7 resp_concern_…  3.35     2     4 exp_age_bin "0"              
 8 resp_confiden…  1.65     1     2 exp_age_bin "0"              
 9 resp_feel_saf…  1.71     1     2 exp_age_bin "0"              
10 resp_safety     1.41     1     2 exp_age_bin "0"              
# ℹ 122 more rows

5 - COVID survey - another view

Q5 ….
COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2.
This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.
Write alt text for your visualization as well.

Code
library(ggplot2)
library(dplyr)
library(forcats)
library(viridis)
Loading required package: viridisLite

Attaching package: 'viridis'
The following object is masked from 'package:scales':

    viridis_pal
Code
# Define response labels
response_labels <- c(
  resp_safety = "Vaccine is safe",
  resp_feel_safe_at_work = "Feel safer\n at work",
  resp_concern_safety = "Concern about \nvaccine safety",
  resp_confidence_science = "Confidence in \nscientific vetting",
  resp_trust_info = "Trust in \nvaccine info",
  resp_will_recommend = "Will recommend\nvaccine"
)

# Step 1: Compute % response per question and response_value
likert_summary <- covid_survey_longer %>%
  group_by(response, response_value) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(response) %>%
  mutate(percent = count / sum(count) * 100) %>%
  ungroup() %>%
  mutate(
    response = factor(response, levels = unique(response)),  # preserve order
    response_value = factor(response_value, levels = 1:5)
  )

# Center percentages for diverging bar chart (v1)
likert_summary <- likert_summary %>%
  mutate(
    centered_percent = case_when(
      response_value < 3 ~ -percent,
      response_value == 3 ~ 0,
      response_value > 3 ~ percent
    )
  )
Warning: There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `centered_percent = case_when(...)`.
Caused by warning in `Ops.factor()`:
! '<' not meaningful for factors
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining
  warning.
Code
# Compute mean Likert per question
mean_scores <- covid_survey_longer %>%
  group_by(response) %>%
  summarise(mean_value = mean(response_value, na.rm = TRUE))

# Recompute likert_summary with deviation × percent
likert_summary <- covid_survey_longer %>%
  group_by(response, response_value) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(response) %>%
  mutate(percent = count / sum(count) * 100) %>%
  ungroup() %>%
  mutate(response_value = as.numeric(as.character(response_value))) %>%
  left_join(mean_scores, by = "response") %>%
  mutate(
    deviation = response_value - mean_value,
    centered_percent = deviation * percent
  )

summary(likert_summary)
   response         response_value     count     
 Length:36          Min.   :1      Min.   :  75  
 Class :character   1st Qu.:2      1st Qu.: 213  
 Mode  :character   Median :3      Median : 346  
                    Mean   :3      Mean   :1206  
                    3rd Qu.:4      3rd Qu.:1484  
                    Max.   :5      Max.   :6077  
                    NA's   :6                    
    percent         mean_value      deviation      
 Min.   : 1.036   Min.   :1.212   Min.   :-2.2777  
 1st Qu.: 2.943   1st Qu.:1.357   1st Qu.:-0.1676  
 Median : 4.780   Median :1.417   Median : 1.2656  
 Mean   :16.667   Mean   :1.786   Mean   : 1.2142  
 3rd Qu.:20.506   3rd Qu.:2.034   3rd Qu.: 2.5919  
 Max.   :83.960   Max.   :3.278   Max.   : 3.7879  
                                  NA's   :6        
 centered_percent 
 Min.   :-59.794  
 1st Qu.: -2.306  
 Median :  4.569  
 Mean   :  0.000  
 3rd Qu.:  6.524  
 Max.   : 49.331  
 NA's   :6        
Code
# Diverging bar chart (centered around mean), with descriptive y-axis labels
g5 <- ggplot(likert_summary, aes(
  x = centered_percent,
  y = fct_rev(fct_relabel(factor(response), ~ response_labels[.x])),
  fill = factor(response_value)
)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(
    values = c("1" = "#d73027", "2" = "#fc8d59", "3" = "#ffffbf", "4" = "#91bfdb", "5" = "#4575b4"),
    name = "Likert response",
    labels = c("Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree")
  ) +
  labs(
    title = "Diverging Bar Chart of Likert Responses Centered Around Question Mean",
    x = "Deviation × Percentage",
    y = "Survey question"
  ) +
  theme_minimal()

print(g5)
Warning: Removed 6 rows containing missing values or values outside the
scale range (`geom_bar()`).

Code
# Ensure response_value is factor for 100% chart
likert_summary <- likert_summary %>%
  mutate(response_value = factor(response_value, levels = 1:5))

# 100% stacked bar chart, with descriptive y-axis labels
g100 <- ggplot(likert_summary, aes(
  x = percent,
  y = fct_rev(fct_relabel(factor(response), ~ response_labels[.x])),
  fill = response_value
)) +
  geom_bar(stat = "identity", width = 0.7) +
  scale_fill_manual(
    values = c("1" = "#d73027", "2" = "#fc8d59", "3" = "#ffffbf", "4" = "#91bfdb", "5" = "#4575b4"),
    name = "Response",
    labels = c("Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree")
  ) +
  labs(
    title = "100% Stacked Bar Chart of Likert Responses",
    x = "Percentage of responses",
    y = "Survey question"
  ) +
  theme_minimal()

print(g100)